Our research topic is Trend of Major Types of Crimes commited by White Males in the DC Area in 2016-2021. We chose this topic because we are interested in the impact of COVID-19 on crimes. We will use the data provided by the Metropolitan Police of DC regarding adult arrests over a time period stretching between 2016-2021.
We read the data .CSV files of adults arrest in DC area from 2016-2021
df_2016<-data.frame(read.csv("Arrests 2016 Public.csv"))
df_2017<-data.frame(read.csv("Arrests 2017 Public.csv"))
df_2018<-data.frame(read.csv("Arrests by Year, 2018.csv"))
df_2019<-data.frame(read.csv("Arrests by Year, 2019.csv"))
df_2020<-data.frame(read.csv("Arrests by Year 2020.csv"))
df_2021<-data.frame(read.csv("2021 Adult Arrests.csv"))
c16 <- c(colnames(df_2016))
c18 <- c(colnames(df_2018))
The column names of the data in 2016 and 2017 were not the same with others. The below table shows the column names of the data in 2016 and the data in 2016.
| col # | 2016 | 2018 |
|---|---|---|
| 1 | Arrestee.Type | Arrestee.Type |
| 2 | Arrest.Year | Arrest.Year |
| 3 | Arrest.Date | Arrest.Date |
| 4 | Arrest.Hour | Arrest.Hour |
| 5 | CCN | CCN |
| 6 | Arrest.Number. | Arrest.Number. |
| 7 | Age | Age |
| 8 | Defendant.PSA | Defendant.PSA |
| 9 | Defendant.District | Defendant.District |
| 10 | Defendant.Race | Defendant.Race |
| 11 | Defendant.Ethnicity | Defendant.Ethnicity |
| 12 | Defendant.Sex | Defendant.Sex |
| 13 | Arrest.Category | Arrest.Category |
| 14 | Charge.Description | Charge.Description |
| 15 | Arrest.Location.PSA | Arrest.Location.PSA |
| 16 | Arrest.Location.District | Arrest.Location.District |
| 17 | Arrest.Location.Block.GeoX | Arrest.Block.GEOX |
| 18 | Arrest.Location.Block.GeoY | Arrest.Block.GEOY |
| 19 | Offense.GEOY | Arrest.Latitude |
| 20 | Offense.GEOX | Arrest.Longitude |
| 21 | Offense.PSA | Offense.Location.PSA |
| 22 | Offense.District | Offense.Location.District |
| 23 | Arrest.Latitude | Offense.Block.GEOX |
| 24 | Arrest.Longitude | Offense.Block.GEOY |
| 25 | Offense.Latitude | Offense.Latitude |
| 26 | Offense.Longitude | Offense.Longitude |
The column names were same from the first column to the 14th column
in both data. On the other hand, the name and order of 15th and latter
columns were a bit different in those data. The latter columns were
about locations, and we were not very interested in the detail location.
Therefore, we deleted the latter columns except for the 16th and 22nd
columns. In addition, we dropped CNN (col #5) and
Arrest.Number. (col #6) because they were IDs and useless
for our analysis.
The format of date was different from years; the data in 2016 and 2017 has the format like , the data in 2018 to 2020 has the format like , and the data in 2021 has the format like . We coverted Since different date formats for different years are difficult to analyze, we will unify the date format to “yyyy-mm-dd”.
After deleting some columns and changing the date format, we binded data frames by rows.
# convert format
df_2018$Arrest.Date <- as.Date(df_2018$Arrest.Date, format = "%m/%d/%Y") %>% format()
df_2019$Arrest.Date <- as.Date(df_2019$Arrest.Date, format = "%m/%d/%Y") %>% format()
df_2020$Arrest.Date <- as.Date(df_2020$Arrest.Date, format = "%m/%d/%Y") %>% format()
df_2021$Arrest.Date <- as.Date(df_2021$Arrest.Date, format = "%Y/%m/%d") %>% format()
#bind df_2016 and df_2017, and delete some columns
df_16_17 <- rbind(df_2016, df_2017)[,-c(5,6,15,17:21,23:26)]
names(df_16_17)[c(13,14)] <- c('Arrest.Location.District','Offense.Location.District') #rename columns
#bind df_2018 - df_2021, and delete some columns
df_18_21 <- rbind(df_2018, df_2019, df_2020, df_2021)[,-c(5,6,15,17:21,23:26)]
DF<-rbind(df_16_17,df_18_21)
To see whether there were abnormal values, we created the table showing some statistics for numerical variables.
xkablesummary(subset(DF,select=c(Arrest.Year, Arrest.Hour, Age)))
| Arrest.Year | Arrest.Hour | Age | |
|---|---|---|---|
| Min | Min. :2016 | Min. : 0.00 | Min. : 18.00 |
| Q1 | 1st Qu.:2017 | 1st Qu.: 6.00 | 1st Qu.: 25.00 |
| Median | Median :2018 | Median :12.00 | Median : 32.00 |
| Mean | Mean :2018 | Mean :11.81 | Mean : 35.19 |
| Q3 | 3rd Qu.:2019 | 3rd Qu.:18.00 | 3rd Qu.: 43.00 |
| Max | Max. :2021 | Max. :23.00 | Max. :121.00 |
The maximum age was too old. 55 rows were assigned an age of over 100 years (117-121 ) in these data, and it seemed to be wrong. Therefore, we dropped these rows.
DF <- DF[!DF$Age>=100,]
# replace dots with underscore for clarity sake, i think..
names(DF) = gsub("[.]", "_", names(DF))
colnames(DF)
## [1] "Arrestee_Type" "Arrest_Year"
## [3] "Arrest_Date" "Arrest_Hour"
## [5] "Age" "Defendant_PSA"
## [7] "Defendant_District" "Defendant_Race"
## [9] "Defendant_Ethnicity" "Defendant_Sex"
## [11] "Arrest_Category" "Charge_Description"
## [13] "Arrest_Location_District" "Offense_Location_District"
#find unique values in the race, sex and arrest_category columns.
#unique(DF$Defendant_Race)
#unique(DF$Defendant_Sex)
#unique(DF$Arrest_Category)
# most likely "UNK" is the same as "Unknown", so we can change this
DF$Defendant_Race[DF$Defendant_Race == 'UNK'] <- 'UNKNOWN'
#unique(DF$Defendant_Race) - check that it changed
#same issue, "unk" is very likely "unknown", so change it.
DF$Defendant_Sex[DF$Defendant_Sex == 'UNK'] <- 'UNKNOWN'
#unique(DF$Defendant_Sex) - check that it changed
# Arrest category - 4 different types of Fraud & Financial crimes , 3 types of Release Violations/Fugitive -- group them into one.
DF$Arrest_Category = gsub("Fraud and Financial Crimes.*","Fraud and Financial Crimes", DF$Arrest_Category)
DF$Arrest_Category = gsub("Release Violations/Fugitive.*","Release Violations/Fugitive",DF$Arrest_Category)
#sort(unique(DF$Arrest_Category)) - check that new changes were made.
sapply(DF, function(x) sum(is.na(x)))
## Arrestee_Type Arrest_Year Arrest_Date
## 0 0 0
## Arrest_Hour Age Defendant_PSA
## 0 0 29093
## Defendant_District Defendant_Race Defendant_Ethnicity
## 9337 0 0
## Defendant_Sex Arrest_Category Charge_Description
## 0 12 15
## Arrest_Location_District Offense_Location_District
## 184 11
#get month and day variables.. might be interesting, who knows?
DF <- separate(DF, col = Arrest_Date, into = c("Year","Month","Day"), sep = "-", remove = FALSE, fill="left")
#remove the new year column formed, it is redundant.. we already have Year column
DF = DF[,-4]
colnames(DF)
## [1] "Arrestee_Type" "Arrest_Year"
## [3] "Arrest_Date" "Month"
## [5] "Day" "Arrest_Hour"
## [7] "Age" "Defendant_PSA"
## [9] "Defendant_District" "Defendant_Race"
## [11] "Defendant_Ethnicity" "Defendant_Sex"
## [13] "Arrest_Category" "Charge_Description"
## [15] "Arrest_Location_District" "Offense_Location_District"
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Factorize some variables
DF$Arrest_Year = as.factor(DF$Arrest_Year)
DF$Month = as.factor(DF$Month)
DF$Day = as.factor(DF$Day)
DF$Defendant_Race = as.factor(DF$Defendant_Race)
DF$Defendant_Sex = as.factor(DF$Defendant_Sex)
DF$Arrest_Location_District = as.factor(DF$Arrest_Location_District)
DF$Offense_Location_District = as.factor(DF$Offense_Location_District)
# convert to date format
DF$Arrest_Date = as.Date(DF$Arrest_Date)
# Day format
DF$Day = day(DF$Arrest_Date)
# i want to create a week-day variable
DF$Weekday = weekdays(DF$Arrest_Date)
DF$Weekday = factor(DF$Weekday, levels = as.character(wday(c(2:7,1), label=TRUE, abbr=FALSE)))
# convert crime types to factors
DF$Arrest_Category = as.factor(DF$Arrest_Category)
Arrest_Category had some different values for 2021 and
other years:
Therefore, we coverted these values in 2021 into the correspond values in other years.
DF <- mutate(DF, Arrest_Category = gsub(Arrest_Category, pattern = "Release Violations/Fugitive.*", replacement = "Release Violations/Fugitive"))
DF <- mutate(DF, Arrest_Category = gsub(Arrest_Category, pattern = "Fraud and Financial Crimes.*", replacement = "Fraud and Financial Crimes"))
Since we were interested in crimes committed by while males, we
dropped rows where the value of Defendant_Race was not
“White”. The structure of the final data is shown in the below
table.
DF_WM <- subset(DF, subset = Defendant_Race=='WHITE' & Defendant_Sex=='MALE')
data.frame(column_name = names(DF_WM),
class = sapply(DF_WM, typeof),
first_values = sapply(DF_WM, function(x) paste0(head(x), collapse = ", ")),
row.names = NULL) %>%
kable("simple", caption = 'Data frame structure')
| column_name | class | first_values |
|---|---|---|
| Arrestee_Type | character | Adult Arrest, Adult Arrest, Adult Arrest, Adult Arrest, Adult Arrest, Adult Arrest |
| Arrest_Year | integer | 2016, 2016, 2016, 2016, 2016, 2016 |
| Arrest_Date | double | 2016-01-01, 2016-01-01, 2016-01-01, 2016-01-01, 2016-01-01, 2016-01-01 |
| Month | integer | 01, 01, 01, 01, 01, 01 |
| Day | integer | 1, 1, 1, 1, 1, 1 |
| Arrest_Hour | integer | 0, 0, 1, 1, 13, 2 |
| Age | integer | 39, 27, 27, 26, 48, 25 |
| Defendant_PSA | character | Out of State, Out of State, Out of State, Out of State, 404, Out of State |
| Defendant_District | character | Out of State, Out of State, Out of State, Out of State, 4D, Out of State |
| Defendant_Race | integer | WHITE, WHITE, WHITE, WHITE, WHITE, WHITE |
| Defendant_Ethnicity | character | UNKNOWN, NOT HISPANIC, HISPANIC, NOT HISPANIC, NOT HISPANIC, HISPANIC |
| Defendant_Sex | integer | MALE, MALE, MALE, MALE, MALE, MALE |
| Arrest_Category | character | Simple Assault, Simple Assault, Driving/Boating While Intoxicated, Simple Assault, Simple Assault, Simple Assault |
| Charge_Description | character | Threats To Do Bodily Harm -misd, Simple Assault, Driving While Intoxicated -2nd Off, Simple Assault, Simple Assault, Simple Assault |
| Arrest_Location_District | integer | 2D, 3D, 4D, 5D, 1D, 3D |
| Offense_Location_District | integer | 2D, 3D, 4D, 5D, 1D, 3D |
| Weekday | integer | Friday, Friday, Friday, Friday, Friday, Friday |
by_hour <- DF %>%
group_by(Arrest_Hour) %>%
dplyr::summarise(Total = n())
by_hour
## # A tibble: 24 × 2
## Arrest_Hour Total
## <int> <int>
## 1 0 5681
## 2 1 7425
## 3 2 6769
## 4 3 6286
## 5 4 5425
## 6 5 4412
## 7 6 4465
## 8 7 6101
## 9 8 6729
## 10 9 6594
## # … with 14 more rows
ggplot(by_hour, aes(Arrest_Hour, Total, color = Arrest_Hour)) +
geom_line() +
ggtitle("Crimes By Hour") +
xlab("Hour of the Day") +
ylab("Total Crimes")
by_day <- DF %>%
group_by(Day) %>%
dplyr::summarise(Total = n())
by_day
## # A tibble: 31 × 2
## Day Total
## <int> <int>
## 1 1 5516
## 2 2 5193
## 3 3 5240
## 4 4 5138
## 5 5 5189
## 6 6 4998
## 7 7 4878
## 8 8 5022
## 9 9 4947
## 10 10 5090
## # … with 21 more rows
ggplot(by_day, aes(Day, Total, color = Day)) +
geom_line() +
ggtitle("Crimes By Day") +
xlab("Day of the Month") +
ylab("Total Crimes")
by_weekday = DF %>% group_by(Weekday) %>%
dplyr::summarise(Total = n())
by_weekday$Percent <- by_weekday$Total/dim(DF)[1] * 100
by_weekday
## # A tibble: 7 × 3
## Weekday Total Percent
## <fct> <int> <dbl>
## 1 Monday 19553 12.8
## 2 Tuesday 21418 14.1
## 3 Wednesday 23520 15.4
## 4 Thursday 23241 15.3
## 5 Friday 23189 15.2
## 6 Saturday 22142 14.5
## 7 Sunday 19268 12.6
ggplot(by_weekday, aes(Weekday, Total, fill = Weekday)) +
geom_bar(stat = "identity") +
ggtitle("Crimes By Weekday ") +
xlab("Day of the Week") + ylab("Count") +
theme(legend.position = "none")
by_month <- DF %>%
group_by(Month) %>%
dplyr::summarise(Total = n())
by_month$Percent <- by_month$Total/dim(DF)[1] * 100
by_month
## # A tibble: 12 × 3
## Month Total Percent
## <fct> <int> <dbl>
## 1 01 12751 8.37
## 2 02 12158 7.98
## 3 03 13625 8.94
## 4 04 12344 8.10
## 5 05 13427 8.81
## 6 06 12729 8.36
## 7 07 13008 8.54
## 8 08 12991 8.53
## 9 09 12578 8.26
## 10 10 13029 8.55
## 11 11 11869 7.79
## 12 12 11822 7.76
#ggplot(by_month, aes(Month, Total, fill = Month)) +
#geom_bar(stat = "identity") +
#ggtitle("Crimes By Month") +
#xlab("Month") +
#ylab("Count") +
#theme(legend.position = "none")
ggplot(by_month, aes(x=Month, y=Total, group=1)) + geom_line()
by_year = DF %>% group_by(Arrest_Year) %>%
dplyr::summarise(Total = n())
by_year$Percent <- by_year$Total/dim(DF)[1] * 100
by_year
## # A tibble: 6 × 3
## Arrest_Year Total Percent
## <fct> <int> <dbl>
## 1 2016 29980 19.7
## 2 2017 31209 20.5
## 3 2018 29100 19.1
## 4 2019 27915 18.3
## 5 2020 18479 12.1
## 6 2021 15648 10.3
#ggplot(by_year, aes(Arrest_Year, Total, fill = Arrest_Year)) +
#geom_bar(stat = "identity") +
#ggtitle("Crimes By Year ") +
#xlab("Year") + ylab("Count") +
#theme(legend.position = "none")
ggplot(by_year, aes(x=Arrest_Year, y=Total, group=1)) + geom_line()
PADM = ggplot(DF, aes(group = factor(Arrest_Year), y = Age,x= Arrest_Year, fill = factor(Arrest_Year))) +
geom_boxplot() +
geom_boxplot(outlier.shape=8, outlier.size=5) +
labs(title="Age VS Year", x="Year", y = "Age")
PADM
This plot is to compare the age of people who get arrested with each different year. From the plot, we can see that there are lots of outliers. We need to get rid of the outliers first.
Q1 = quantile(DF$Age, .25)
Q3 = quantile(DF$Age, .75)
IQR = IQR(DF$Age)
#only keep rows in dataframe that have values within 1.5*IQR of Q1 and Q3
ndf = subset(DF, DF$Age> (Q1 - 1.5*IQR) & DF$Age< (Q3 + 1.5*IQR))
#view row and column count of new data frame
dim(ndf)
## [1] 151552 17
dim(DF)
## [1] 152331 17
Removed 834 outliers.
ATY = ggplot(ndf, aes(group = factor(Arrest_Year), y = Age, x= Arrest_Year, fill = factor(Arrest_Year))) +
geom_boxplot() +
geom_boxplot(outlier.shape=8, outlier.size=5) +
labs(title="Age VS Year", x="Year", y = "Age")
ATY
After remove the ourliers, we could clearly see that as year goes up,
the minimum age goes up a little bit. The maximum age from this sample
goes down. The median is pretty much same compare to different
years.
There are less younger criminals as year passing from 2016 to 2020 based on this sample. Criminal with age from 30 to 35 arrested more than other ages, which also probably means that there are more offenders with age from 30 to 35.
Why boxplot? The advantage of consider median over sample mean is that it is less affected by extreme observations.
unique(DF$Defendant_Race)
## [1] WHITE BLACK UNKNOWN ASIAN MULTIPLE OTHER
## Levels: ASIAN BLACK MULTIPLE OTHER UNKNOWN WHITE
rrw = sum(DF$Defendant_Race == "WHITE")
rrb = sum(DF$Defendant_Race == "BLACK")
rra = sum(DF$Defendant_Race == "ASIAN")
rrw
## [1] 15739
rrb
## [1] 131385
rra
## [1] 897
unique(DF$Arrest_Category)
## [1] "Simple Assault" "Assault on a Police Officer"
## [3] "Traffic Violations" "Weapon Violations"
## [5] "Driving/Boating While Intoxicated" "Narcotics"
## [7] "Disorderly Conduct" "Theft"
## [9] "Liquor Law Violations" "Other Crimes"
## [11] "Theft from Auto" "Offenses Against Family & Children"
## [13] "Assault with a Dangerous Weapon" "Release Violations/Fugitive"
## [15] "Motor Vehicle Theft" "Damage to Property"
## [17] "Sex Abuse" "Property Crimes"
## [19] "Vending Violations" "Robbery"
## [21] "Aggravated Assault" "Burglary"
## [23] "Sex Offenses" "Fraud and Financial Crimes"
## [25] "Prostitution" "Homicide"
## [27] "Kidnapping" "Gambling"
## [29] "Arson" NA
ss = subset(DF,DF$Arrest_Category == "Sex Abuse")
HH = ggplot(ss, aes(group = factor(Arrest_Year), y = Arrest_Hour,x= Arrest_Year, fill = factor(Arrest_Year))) +
geom_boxplot() +
geom_boxplot(outlier.shape=8, outlier.size=5) +
labs(title="Arrest_Hour VS Year", x="Year", y = "Arrest_Hour")
HH
From the box plot, we can see that most sex abuse happens around 10am to 13pm. The sex abuse happens all the time and it changes with different years.
tt = subset(DF,DF$Arrest_Category == "Theft")
HHH = ggplot(tt, aes(group = factor(Arrest_Year), y = Arrest_Hour,x= Arrest_Year, fill = factor(Arrest_Year))) +
geom_boxplot() +
geom_boxplot(outlier.shape=8, outlier.size=5) +
labs(title="Arrest_Hour VS Year", x="Year", y = "Arrest_Hour")
HHH
From the box plot, we can see that most theft happens around 15pm and they all super same with each year except year 2019. The theft always happening from 11am to 19 pm. That’s a funny fact.
### Time to investigate our main focus group - White Males - EDA
#unique(DF$Defendant_Race)
#table(DF$Defendant_Sex)
df_wm = subset(DF, subset = Defendant_Race == "WHITE" & Defendant_Sex == "MALE")
head(df_wm, 20)
## Arrestee_Type Arrest_Year Arrest_Date Month Day Arrest_Hour Age
## 1 Adult Arrest 2016 2016-01-01 01 1 0 39
## 2 Adult Arrest 2016 2016-01-01 01 1 0 27
## 12 Adult Arrest 2016 2016-01-01 01 1 1 27
## 14 Adult Arrest 2016 2016-01-01 01 1 1 26
## 24 Adult Arrest 2016 2016-01-01 01 1 13 48
## 54 Adult Arrest 2016 2016-01-01 01 1 2 25
## 76 Adult Arrest 2016 2016-01-01 01 1 3 21
## 84 Adult Arrest 2016 2016-01-01 01 1 3 41
## 96 Adult Arrest 2016 2016-01-01 01 1 6 29
## 98 Adult Arrest 2016 2016-01-01 01 1 7 22
## 104 Adult Arrest 2016 2016-01-02 01 2 0 51
## 110 Adult Arrest 2016 2016-01-02 01 2 1 29
## 114 Adult Arrest 2016 2016-01-02 01 2 11 64
## 123 Adult Arrest 2016 2016-01-02 01 2 15 33
## 131 Adult Arrest 2016 2016-01-02 01 2 16 23
## 138 Adult Arrest 2016 2016-01-02 01 2 17 49
## 161 Adult Arrest 2016 2016-01-02 01 2 21 30
## 171 Adult Arrest 2016 2016-01-02 01 2 3 22
## 175 Adult Arrest 2016 2016-01-02 01 2 4 28
## 194 Adult Arrest 2016 2016-01-03 01 3 15 27
## Defendant_PSA Defendant_District Defendant_Race Defendant_Ethnicity
## 1 Out of State Out of State WHITE UNKNOWN
## 2 Out of State Out of State WHITE NOT HISPANIC
## 12 Out of State Out of State WHITE HISPANIC
## 14 Out of State Out of State WHITE NOT HISPANIC
## 24 404 4D WHITE NOT HISPANIC
## 54 Out of State Out of State WHITE HISPANIC
## 76 Out of State Out of State WHITE HISPANIC
## 84 307 3D WHITE HISPANIC
## 96 Out of State Out of State WHITE HISPANIC
## 98 402 4D WHITE HISPANIC
## 104 Out of State Out of State WHITE NOT HISPANIC
## 110 Out of State Out of State WHITE HISPANIC
## 114 Out of State Out of State WHITE UNKNOWN
## 123 302 3D WHITE HISPANIC
## 131 506 5D WHITE HISPANIC
## 138 Out of State Out of State WHITE HISPANIC
## 161 Out of State Out of State WHITE NOT HISPANIC
## 171 Out of State Out of State WHITE HISPANIC
## 175 201 2D WHITE NOT HISPANIC
## 194 403 4D WHITE HISPANIC
## Defendant_Sex Arrest_Category
## 1 MALE Simple Assault
## 2 MALE Simple Assault
## 12 MALE Driving/Boating While Intoxicated
## 14 MALE Simple Assault
## 24 MALE Simple Assault
## 54 MALE Simple Assault
## 76 MALE Liquor Law Violations
## 84 MALE Driving/Boating While Intoxicated
## 96 MALE Simple Assault
## 98 MALE Simple Assault
## 104 MALE Release Violations/Fugitive
## 110 MALE Traffic Violations
## 114 MALE Simple Assault
## 123 MALE Assault with a Dangerous Weapon
## 131 MALE Weapon Violations
## 138 MALE Liquor Law Violations
## 161 MALE Narcotics
## 171 MALE Simple Assault
## 175 MALE Damage to Property
## 194 MALE Traffic Violations
## Charge_Description
## 1 Threats To Do Bodily Harm -misd
## 2 Simple Assault
## 12 Driving While Intoxicated -2nd Off
## 14 Simple Assault
## 24 Simple Assault
## 54 Simple Assault
## 76 Poss Of Open Container Of Alcohol/public Intoxication
## 84 Driving Under Influence -2nd Off
## 96 Simple Assault
## 98 Simple Assault
## 104 Failure To Appear (USAO)
## 110 No Permit
## 114 Simple Assault
## 123 Assault With A Dangerous Weapon
## 131 Possess Prohibited Weapon
## 138 Possession Of An Open Container Of Alcohol (poca)
## 161 Poss W/i To Dist A Controlled Substance
## 171 Simple Assault
## 175 Destruction Of Property Less Than $1000
## 194 No Permit
## Arrest_Location_District Offense_Location_District Weekday
## 1 2D 2D Friday
## 2 3D 3D Friday
## 12 4D 4D Friday
## 14 5D 5D Friday
## 24 1D 1D Friday
## 54 3D 3D Friday
## 76 2D 2D Friday
## 84 2D 2D Friday
## 96 2D 2D Friday
## 98 4D 4D Friday
## 104 4D 1D Saturday
## 110 4D 4D Saturday
## 114 2D 2D Saturday
## 123 4D 4D Saturday
## 131 5D 5D Saturday
## 138 3D 3D Saturday
## 161 1D 1D Saturday
## 171 2D 2D Saturday
## 175 3D 3D Saturday
## 194 4D 4D Sunday
# i want to create a week-day variable
df_wm$Weekday = weekdays(df_wm$Arrest_Date)
df_wm$Weekday = factor(df_wm$Weekday, levels = as.character(wday(c(2:7,1), label=TRUE, abbr=FALSE)))
same pattern here as above, will dig into a few other stuff too..
wm_by_hour <- df_wm %>%
group_by(Arrest_Hour) %>%
dplyr::summarise(Total = n())
wm_by_hour
## # A tibble: 24 × 2
## Arrest_Hour Total
## <int> <int>
## 1 0 605
## 2 1 801
## 3 2 813
## 4 3 695
## 5 4 552
## 6 5 329
## 7 6 322
## 8 7 405
## 9 8 424
## 10 9 442
## # … with 14 more rows
ggplot(wm_by_hour, aes(Arrest_Hour, Total, color = Arrest_Hour)) +
geom_line() +
ggtitle("White Males - Crimes By Hour") +
xlab("Hour of the Day") +
ylab("Total Crimes")
wm_by_day <- df_wm %>%
group_by(Day) %>%
dplyr::summarise(Total = n())
wm_by_day
## # A tibble: 31 × 2
## Day Total
## <int> <int>
## 1 1 524
## 2 2 404
## 3 3 403
## 4 4 395
## 5 5 405
## 6 6 455
## 7 7 405
## 8 8 376
## 9 9 399
## 10 10 430
## # … with 21 more rows
ggplot(wm_by_day, aes(Day, Total, color = Day)) +
geom_line() +
ggtitle("White Males - Crimes By Day") +
xlab("Day of the Month") +
ylab("Total Crimes")
wm_by_weekday = df_wm %>% group_by(Weekday) %>%
dplyr::summarise(Total = n())
wm_by_weekday$Percent <- wm_by_weekday$Total/dim(df_wm)[1] * 100
wm_by_weekday
## # A tibble: 7 × 3
## Weekday Total Percent
## <fct> <int> <dbl>
## 1 Monday 1566 12.8
## 2 Tuesday 1463 11.9
## 3 Wednesday 1639 13.3
## 4 Thursday 1736 14.1
## 5 Friday 1876 15.3
## 6 Saturday 2046 16.7
## 7 Sunday 1952 15.9
ggplot(wm_by_weekday, aes(Weekday, Total, fill = Weekday)) +
geom_bar(stat = "identity") +
ggtitle("White Males - Crimes By Weekday ") +
xlab("Day of the Week") + ylab("Count") +
theme(legend.position = "none")
wm_by_month <- df_wm %>%
group_by(Month) %>%
dplyr::summarise(Total = n())
wm_by_month$Percent <- wm_by_month$Total/dim(df_wm)[1] * 100
wm_by_month
## # A tibble: 12 × 3
## Month Total Percent
## <fct> <int> <dbl>
## 1 01 1189 9.68
## 2 02 998 8.13
## 3 03 1125 9.16
## 4 04 921 7.50
## 5 05 1058 8.62
## 6 06 1010 8.23
## 7 07 966 7.87
## 8 08 961 7.83
## 9 09 1023 8.33
## 10 10 1108 9.02
## 11 11 991 8.07
## 12 12 928 7.56
ggplot(wm_by_month, aes(Month, Total, fill = Month)) +
geom_bar(stat = "identity") +
ggtitle("White Males - Crimes By Month") +
xlab("Month") +
ylab("Count") +
theme(legend.position = "none")
ggplot(wm_by_month, aes(x=Month, y=Total, group=1)) + geom_line()
wm_by_year = df_wm %>% group_by(Arrest_Year) %>%
dplyr::summarise(Total = n())
wm_by_year$Percent <- wm_by_year$Total/dim(df_wm)[1] * 100
wm_by_year
## # A tibble: 6 × 3
## Arrest_Year Total Percent
## <fct> <int> <dbl>
## 1 2016 2620 21.3
## 2 2017 2636 21.5
## 3 2018 2297 18.7
## 4 2019 2191 17.8
## 5 2020 1425 11.6
## 6 2021 1109 9.03
ggplot(wm_by_year, aes(Arrest_Year, Total, fill = Arrest_Year)) +
geom_bar(stat = "identity") +
ggtitle("White Males - Crimes By Year ") +
xlab("Year") + ylab("Count") +
theme(legend.position = "none")
ggplot(wm_by_year, aes(x=Arrest_Year, y=Total, group=1)) + geom_line()
wm_by_cat <- df_wm %>%
group_by(Arrest_Category) %>%
dplyr::summarise(Total = n()) %>%
arrange(desc(Total))
wm_by_cat[1:10,]
## # A tibble: 10 × 2
## Arrest_Category Total
## <chr> <int>
## 1 Simple Assault 2661
## 2 Traffic Violations 1548
## 3 Release Violations/Fugitive 1133
## 4 Driving/Boating While Intoxicated 1045
## 5 Other Crimes 821
## 6 Theft 674
## 7 Narcotics 654
## 8 Liquor Law Violations 562
## 9 Disorderly Conduct 433
## 10 Damage to Property 414
ggplot(wm_by_cat, aes(reorder(Arrest_Category, Total), Total)) +
geom_bar(stat = "identity") + coord_flip() +
scale_y_continuous(breaks = seq(0,3000,500)) +
ggtitle("Crimes By Arrest Category") +
xlab("Crime Type") +
ylab("Number of Incidents")
wm_by_cat_year <- df_wm %>% group_by(Arrest_Year, Arrest_Category) %>%
dplyr::summarise(Total = n())
## `summarise()` has grouped output by 'Arrest_Year'. You can override using the
## `.groups` argument.
wm_by_cat_year[1:10,]
## # A tibble: 10 × 3
## # Groups: Arrest_Year [1]
## Arrest_Year Arrest_Category Total
## <fct> <chr> <int>
## 1 2016 Aggravated Assault 23
## 2 2016 Assault on a Police Officer 42
## 3 2016 Assault with a Dangerous Weapon 73
## 4 2016 Burglary 25
## 5 2016 Damage to Property 98
## 6 2016 Disorderly Conduct 83
## 7 2016 Driving/Boating While Intoxicated 206
## 8 2016 Fraud and Financial Crimes 11
## 9 2016 Homicide 2
## 10 2016 Kidnapping 4
ggplot(wm_by_cat_year, aes(reorder(Arrest_Category, Total), Total, fill = Arrest_Year)) +
geom_bar(stat = "identity") +
scale_y_continuous(breaks = seq(0,3000,500)) +
coord_flip() + ggtitle("Crimes By Code and Year") +
xlab("Crime Text Code") +
ylab("Total Crimes")
unique(df_wm$Arrest_Location_District)
## [1] 2D 3D 4D 5D 1D 7D 6D UNKNOWN <NA>
## [10]
## Levels: 1D 2D 3D 4D 5D 6D 7D UNKNOWN
table(df_wm$Arrest_Location_District)
##
## 1D 2D 3D 4D 5D 6D 7D UNKNOWN
## 22 1858 3053 2555 2816 1231 437 260 29
unique(df_wm$Offense_Location_District)
## [1] 2D 3D 4D 5D 1D 7D 6D #N/A UNKNOWN
## [10] Unk
## Levels: #N/A 1D 2D 3D 4D 5D 6D 7D Unk UNKNOWN
table(df_wm$Offense_Location_District)
##
## #N/A 1D 2D 3D 4D 5D 6D 7D Unk UNKNOWN
## 12 2044 3100 2531 2719 1195 413 238 16 10
### drop the unknowns here .. they are few..
wm_by_ALD <- df_wm %>% group_by(Arrest_Location_District) %>%
dplyr::summarise(Total = n()) %>%
dplyr::arrange(desc(Total))
wm_by_ALD2 = wm_by_ALD[1:7,]
wm_by_OLD <- df_wm %>% group_by(Offense_Location_District) %>%
dplyr::summarise(Total = n()) %>%
dplyr::arrange(desc(Total))
wm_by_OLD2 = wm_by_OLD[1:7,]
ggplot(wm_by_ALD2, aes(reorder(Arrest_Location_District, -Total), Total)) +
geom_bar(stat = "identity") +
ggtitle("Crimes by Arrest Location District") +
xlab("Location District") +
ylab("Total Crimes")
ggplot(wm_by_OLD2, aes(reorder(Offense_Location_District, -Total), Total)) +
geom_bar(stat = "identity") +
ggtitle("Crimes by Offense Location District") +
xlab("Location District") +
ylab("Total Crimes")
# top 5 crimes in each district
#ALD_dc_top7 <- wm_by_ALD$Arrest_Location_District[1:5]
#ALD_top7_dc <- subset(df_wm, Arrest_Location_District %in% wm_by_ALD$Arrest_Location_District[1:5])
#ALD_top7_dc$Arrest_Location_District <- factor(ALD_top7_dc$Arrest_Location_District)
#ggplot(ALD_top7_dc, aes(Arrest_Category, fill = Arrest_Location_District)) +
#geom_bar(position = "dodge") +
#ggtitle("Crimes by District Police HeadQuarters - Top 5") +
#xlab("Police HQ") +
#ylab("Total Crimes")
#top crime by ARREST Location District
ALD_by_crime <- df_wm %>%
group_by(Arrest_Location_District, Arrest_Category) %>%
dplyr::summarise(Total = n()) %>%
arrange(desc(Total)) %>% top_n(n = 1)
## `summarise()` has grouped output by 'Arrest_Location_District'. You can
## override using the `.groups` argument.
## Selecting by Total
ALD_by_crime1 = ALD_by_crime[1:7,]
#dc_by_crime <- as.data.frame(dc_by_crime)
#dc_by_crime$Dc_Dist <- factor(dc_by_crime$Dc_Dist)
#dc_by_crime$Text_General_Code <- factor(dc_by_crime$Text_General_Code)
ggplot(ALD_by_crime1, aes(Arrest_Location_District, Total, fill = Arrest_Category)) +
geom_bar(stat = "identity") +
ggtitle("Top Crime by Arrest Location District") +
xlab("Location District") +
ylab("Total")
#top crime by OFFENSE Location District
OLD_by_crime <- df_wm %>%
group_by(Offense_Location_District, Arrest_Category) %>%
dplyr::summarise(Total = n()) %>%
arrange(desc(Total)) %>% top_n(n = 1)
## `summarise()` has grouped output by 'Offense_Location_District'. You can
## override using the `.groups` argument.
## Selecting by Total
OLD_by_crime1 = OLD_by_crime[1:7,]
#dc_by_crime <- as.data.frame(dc_by_crime)
#dc_by_crime$Dc_Dist <- factor(dc_by_crime$Dc_Dist)
#dc_by_crime$Text_General_Code <- factor(dc_by_crime$Text_General_Code)
ggplot(OLD_by_crime1, aes(Offense_Location_District, Total, fill = Arrest_Category)) +
geom_bar(stat = "identity") +
ggtitle("Top Crime by Offense Location District") +
xlab("Location District") +
ylab("Total")
We created some bar plots to see the number of occurrences per type
of crime.
The Bar plots of crimes in each year are as follows:
ggplot(subset(DF_WM,Arrest_Year == 2016), aes(forcats::fct_infreq(Arrest_Category))) +
ggtitle("Figure 4: Bar plot of crimes in 2016") + xlab("crime types") + geom_bar() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggplot(subset(DF_WM,Arrest_Year == 2017), aes(forcats::fct_infreq(Arrest_Category))) +
ggtitle("Figure 5: Bar plot of crimes in 2017") + xlab("crime types") + geom_bar() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggplot(subset(DF_WM,Arrest_Year == 2018), aes(forcats::fct_infreq(Arrest_Category))) +
ggtitle("Figure 6: Bar plot of crimes in 2018") + xlab("crime types") + geom_bar() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggplot(subset(DF_WM,Arrest_Year == 2019), aes(forcats::fct_infreq(Arrest_Category))) +
ggtitle("Figure 7: Bar plot of crimes in 2019") + xlab("crime types") + geom_bar() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggplot(subset(DF_WM,Arrest_Year == 2020), aes(forcats::fct_infreq(Arrest_Category))) +
ggtitle("Figure 8: Bar plot of crimes in 2020") + xlab("crime types") + geom_bar() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggplot(subset(DF_WM,Arrest_Year == 2021), aes(forcats::fct_infreq(Arrest_Category))) +
ggtitle("Figure 9: Bar plot of crimes in 2021") + xlab("crime types") + geom_bar() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
“Offenses Against Family & Children” have been increasing after COVD-19.
cnt_2016 <- table(subset(DF_WM,Arrest_Year==2016)$Arrest_Category)
pos_2016 <- order(cnt_2016, decreasing = TRUE)[1:6]
crime_2016 <- names(cnt_2016)[pos_2016]
cnt_2017 <- table(subset(DF_WM,Arrest_Year==2017)$Arrest_Category)
pos_2017 <- order(cnt_2017, decreasing = TRUE)[1:7]
crime_2017 <- names(cnt_2017)[pos_2017]
cnt_2018 <- table(subset(DF_WM,Arrest_Year==2018)$Arrest_Category)
pos_2018 <- order(cnt_2018, decreasing = TRUE)[1:6]
crime_2018 <- names(cnt_2018)[pos_2018]
cnt_2019 <- table(subset(DF_WM,Arrest_Year==2019)$Arrest_Category)
pos_2019 <- order(cnt_2019, decreasing = TRUE)[1:7]
crime_2019 <- names(cnt_2019)[pos_2019]
cnt_2020 <- table(subset(DF_WM,Arrest_Year==2020)$Arrest_Category)
pos_2020 <- order(cnt_2020, decreasing = TRUE)[1:7]
crime_2020 <- names(cnt_2020)[pos_2020]
cnt_2021 <- table(subset(DF_WM,Arrest_Year==2021)$Arrest_Category)
pos_2021 <- order(cnt_2021, decreasing = TRUE)[1:7]
crime_2021 <- names(cnt_2021)[pos_2021]
The top 6 crimes (or 7 crimes when ‘Other Crimes’ are included) in each year are as follows.
| Rank | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 |
|---|---|---|---|---|---|---|
| 1 | Simple Assault | Simple Assault | Simple Assault | Simple Assault | Simple Assault | Simple Assault |
| 2 | Traffic Violations | Traffic Violations | Traffic Violations | Traffic Violations | Driving/Boating While Intoxicated | Traffic Violations |
| 3 | Release Violations/Fugitive | Release Violations/Fugitive | Release Violations/Fugitive | Prostitution | Release Violations/Fugitive | Driving/Boating While Intoxicated |
| 4 | Driving/Boating While Intoxicated | Driving/Boating While Intoxicated | Driving/Boating While Intoxicated | Driving/Boating While Intoxicated | Traffic Violations | Release Violations/Fugitive |
| 5 | Liquor Law Violations | Other Crimes | Narcotics | Release Violations/Fugitive | Offenses Against Family & Children | Other Crimes |
| 6 | Narcotics | Disorderly Conduct | Theft | Other Crimes | Other Crimes | Offenses Against Family & Children |
| 7 | NA | Liquor Law Violations | NA | Theft | Narcotics | Damage to Property |
To see the trend of the above major crimes, we created a line plot as follows.
SA_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Simple Assault',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Simple Assault',]$Arrest_Year==x)})
TV_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Traffic Violations',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Traffic Violations',]$Arrest_Year==x)})
RV_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Release Violations/Fugitive',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Release Violations/Fugitive',]$Arrest_Year==x)})
DI_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Driving/Boating While Intoxicated',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Driving/Boating While Intoxicated',]$Arrest_Year==x)})
N_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Narcotics',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Narcotics',]$Arrest_Year==x)})
LV_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Liquor Law Violations',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Liquor Law Violations',]$Arrest_Year==x)})
T_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Theft',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Theft',]$Arrest_Year==x)})
DV_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Offenses Against Family & Children',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Offenses Against Family & Children',]$Arrest_Year==x)})
DC_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Disorderly Conduct',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Disorderly Conduct',]$Arrest_Year==x)})
P_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Prostitution',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Prostitution',]$Arrest_Year==x)})
DP_cnt <- sapply(unique(DF_WM[DF_WM$Arrest_Category=='Damage to Property',]$Arrest_Year),
function(x){sum(DF_WM[DF_WM$Arrest_Category=='Damage to Property',]$Arrest_Year==x)})
year_lst <- 2016:2021
major_crimes_df <- data.frame(year_lst, SA_cnt, TV_cnt, RV_cnt, DI_cnt, N_cnt, LV_cnt, T_cnt, DV_cnt, DC_cnt, P_cnt, DP_cnt)
colnames(major_crimes_df) <- c('Year', 'Simple Assault', 'Traffic Violations', 'Release Violations/Fugitive', 'Driving/Boating While Intoxicated',
'Narcotics', 'Liquor Law Violations', 'Theft', 'Offenses Against Family & Children', 'Disorderly Conduct', 'Prostitution', 'Damage to Property')
major_crimes_df2 <- major_crimes_df %>% gather(key = 'Crimes', value = "Count", -Year)
ggplot(data=major_crimes_df2, aes(x=Year, y=Count, color=Crimes)) +
geom_line() + geom_point()
“Simple Assault”, “Traffic Violations”, and “Theft” have clearly declined since 2020. On the other, “Offenses Against Family & Children” has increased in 2020 and 2021 compared to previous years. COVID-19 seems to be related to these trend change. We posed the following SMART QUESTION, and we will analyze these four crimes in detail in the following.
Is there a significant difference in “Simple Assault”, “Traffic Violations”, “Theft”, and “Offenses Against Family & Children” trends among adult white males within the DC area between 2016 and 2021, and could COVID protocols play a role in these trend shifts?
Since crime is likely to be a rare event, the number of occurrences per day of a given crime is expected to follow Poisson distribution. Poisson distribution is a distribution used to describe the distribution of the number of rare phenomena when a large number of them are observed. If a distribution follows Poisson distribution, and the average number of occurrences of the phenomenon is \(\lambda\), the probability that the phenomenon will occur \(x\) times is given by \[p(x) = \exp(-\lambda)\frac{\lambda^{x}}{x!}.\] In the following, we will estimate \(\lambda\) of each crime before and after COVID-19 to see there is a difference in crime trend.
The trend of “Offenses Against Family & Children,” Domestic Violence (DV), appears to have changed after COVID-19. The frequency table of DV before COVID-19 is as follows.
DF_WM_16_19 <- DF_WM[DF_WM$Arrest_Year%in%c(2016,2017,2018,2019),]
DF_WM_16_19_DV <- DF_WM_16_19[DF_WM_16_19$Arrest_Category=='Offenses Against Family & Children',]
# table of date and the number of occurrences
DV_day_16_19 <- sapply(unique(DF_WM_16_19_DV$Arrest_Date),
function(x){sum(DF_WM_16_19_DV$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 1364 | 0.9336071 |
| 1 | 95 | 0.065024 |
| 2 | 2 | 0.0013689 |
| 3 | 0 | 0 |
We can calculate \(\lambda\) from the above table and \(\lambda = 0.0678\). We will plot the histogram and Poisson distribution with \(\lambda = 0.0678\) to check if they match or not.
x_DV <- 0:5
y_DV <- c(1364,95,2,0,0,0)
fx <- dpois(x=x_DV, lambda=99/(365*4+1))
data_DV <- data.frame(x_DV, y_DV, fx)
ggplot(data_DV, aes(x=x_DV,y=y_DV)) +
ggtitle("Figure 13: Histogram of DV in 2016 - 2019") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_DV) +
ggtitle("Figure 14: Relative frequency histogram of DV in 2016 - 2019 \n and Poisson distribution with lambda = 0.0678") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_DV,y=y_DV/(365*4+1)), stat = "identity") +
geom_line(aes(x=x_DV,y=fx), color='red') +
geom_point(aes(x=x_DV,y=fx), color='red')
We can see that the Poisson distribution fits well with the histogram.
Next, we try to estimate \(99\%\) Confidence Interval of \(\lambda\). The variance of Poisson distribution is equal to its mean (\(\lambda\)). Therefore, \(99\%\) Confidence Interval of \(\lambda\) can be written as \[ \bar{x} - z_{*}\cdot\sqrt{\frac{\bar{x}}{n}} \leq \lambda \leq \bar{x} + z_{*}\cdot\sqrt{\frac{\bar{x}}{n}}, \] where \(\bar{x}\) is the sample mean, \(n\) is the sample size, and \(z_*\) is z-value corresponding to the \(99\%\) confidence interval, and the value is 2.58. From this expression, 99% Confidence Interval of \(\lambda\) for DV before COVID-19 is [0.05, 0.0856].
The frequency table of DV after COVID-19 is as follows.
DF_WM_20_21 <- DF_WM[DF_WM$Arrest_Year%in%c(2020,2021),]
DF_WM_20_21_DV <- DF_WM_20_21[DF_WM_20_21$Arrest_Category=='Offenses Against Family & Children',]
# table of date and the number of occurrences
DV_day_20_21 <- sapply(unique(DF_WM_20_21_DV$Arrest_Date),
function(x){sum(DF_WM_20_21_DV$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 680 | 0.9302326 |
| 1 | 47 | 0.0642955 |
| 2 | 1 | 0.001368 |
| 3 | 0 | 0 |
| 4 | 1 | 0.001368 |
| 5 | 0 | 0 |
| … | 0 | 0 |
| 44 | 0 | 0 |
| 45 | 1 | 0.001368 |
| 46 | 0 | 0 |
| … | 0 | 0 |
| 77 | 0 | 0 |
| 78 | 1 | 0.001368 |
| 79 | 0 | 0 |
| … | 0 | 0 |
There are two outliers (45 and 78) in the table. The dates of them are 1 and 1. Since these dates are correspond to “Capitol attack” and “George Floyd protests”, we will drop the value of these dates.
The calculated \(\lambda = 0.0725\). The histogram and the poisson distribution with \(\lambda = 0.0725\) are shown in Figure 16.
x_DV <- 0:5
y_DV <- c(680,47,1,0,1,0)
fx <- dpois(x=x_DV, lambda=53/(365*2+1))
data_DV <- data.frame(x_DV, y_DV, fx)
ggplot(data_DV, aes(x=x_DV,y=y_DV)) +
ggtitle("Figure 15: Histogram of DV in 2020 - 2021") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_DV) +
ggtitle("Figure 16: Reralive frequency histogram of DV in 2020 - 2021 \n and Pission distribution with lambda = 0.0725") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_DV,y=y_DV/(365*2+1)), stat = "identity") +
geom_line(aes(x=x_DV,y=fx), color='red') +
geom_point(aes(x=x_DV,y=fx), color='red')
The Poisson distribution fits well with the histogram.
99% Confidence Interval of \(\lambda\) for DV after COVID-19 is [0.0465, 0.0985].
Figure 17 shows the Confidence Intervals before and after COVID-19. There was overlap in the Confidence Intervals, and it is not possible to say that there was a change in the \(\lambda\) of “Offenses Against Family & Children” before or after COVID-19.
x <- c('Before COVID-19','After COVID-19')
pre_covid_interval <- c(99/(365*4+1)-2.58*(99/(365*4+1)/(356*4+1))**0.5, 99/(365*4+1)+ 2.58*(99/(365*4+1)/(356*4+1))**0.5)
post_covid_interval <- c(53/(365*2+1) - 2.58*(53/(365*2+1)/(356*2+1))**0.5, 53/(365*2+1) + 2.58*(53/(365*2+1)/(356*2+1))**0.5)
data_CI_DV <- data.frame(x,pre_covid_interval,post_covid_interval)
ggplot(data_CI_DV) +
ggtitle("Figure 17: 99% Confidence Interval of lambda for DV") +
xlab("") +
ylab("99% Confidence Interval of lambda") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))
The trend of “Traffic Violations” also appears to have changed after COVID-19. The frequency table of Traffic Violations before COVID-19 is as follows.
DF_WM_16_19_TV <- DF_WM_16_19[DF_WM_16_19$Arrest_Category=='Traffic Violations',]
# table of date and the number of occurrences
TV_day_16_19 <- sapply(unique(DF_WM_16_19_TV$Arrest_Date),
function(x){sum(DF_WM_16_19_TV$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 602 | 0.4123288 |
| 1 | 530 | 0.3627652 |
| 2 | 225 | 0.1540041 |
| 3 | 77 | 0.0527036 |
| 4 | 22 | 0.0150582 |
| 5 | 4 | 0.0027379 |
| 6 | 1 | 6.844627^{-4} |
| 7 | 0 | 0 |
The calculated \(\lambda = 0.907\). The histogram and the poisson distribution with \(\lambda = 0.907\) are shown in Figure 19.
x_TV <- 0:10
y_TV <- c(602,530,225,77,22,4,1,0,0,0,0)
fx <- dpois(x=x_TV, lambda=sum(TV_day_16_19)/(365*4+1))
data_TV <- data.frame(x_TV, y_TV, fx)
ggplot(data_TV, aes(x=x_TV,y=y_TV)) +
ggtitle("Figure 18: Histogram of traffic violations in 2016 - 2019") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_TV) +
ggtitle("Figure 19: Relative frequency histogram of traffic violations in 2016 - 2019 \n and Pission distribution with lambda = 0.907") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_TV,y=y_TV/(365*4+1)), stat = "identity") +
geom_line(aes(x=x_TV,y=fx), color="red") +
geom_point(aes(x=x_TV,y=fx), color='red')
The Poisson distribution fits well with the histogram.
99% Confidence Interval of \(\lambda\) for Traffic Violations before COVID-19 is [0.842, 0.972].
The frequency table of Traffic Violations after COVID-19 is as follows.
DF_WM_20_21_TV <- DF_WM_20_21[DF_WM_20_21$Arrest_Category=='Traffic Violations',]
# table of date and the number of occurrences
TV_day_20_21 <- sapply(unique(DF_WM_20_21_TV$Arrest_Date),
function(x){sum(DF_WM_20_21_TV$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 546 | 0.746922 |
| 1 | 156 | 0.2134063 |
| 2 | 23 | 0.0314637 |
| 3 | 3 | 0.004104 |
| 4 | 3 | 0.004104 |
| 5 | 0 | 0 |
The calculated \(\lambda = 0.306\). The histogram and the poisson distribution with \(\lambda = 0.306\) are shown in Figure 21.
x_TV <- 0:10
y_TV <- c(546,156,23,3,3,0,0,0,0,0,0)
fx <- dpois(x=x_TV, lambda=sum(TV_day_20_21)/(365*2+1))
data_TV <- data.frame(x_TV, y_TV, fx)
ggplot(data_TV, aes(x=x_TV,y=y_TV)) +
ggtitle("Figure 20: Histogram of traffic violations in 2020 - 2021") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_TV) +
ggtitle("Figure 21: Relative frequency histogram of traffic violations in 2020 - 2021 \n and Pission distribution with lambda = 0.306") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_TV,y=y_TV/(365*2+1)), stat = "identity") +
geom_line(aes(x=x_TV,y=fx), color="red") +
geom_point(aes(x=x_TV,y=fx), color='red')
The Poisson distribution fits well with the histogram.
99% Confidence Interval of \(\lambda\) for Traffic Violations before COVID-19 is [0.252, 0.358].
Figure 22 shows the Confidence Intervals before and after COVID-19. There was no overlap in the Confidence Intervals, and there may have been a change in the Traffic Violations lambda before and after COVID-19.
x <- c('Before COVID-19','After COVID-19')
pre_covid_interval <- c(sum(TV_day_16_19)/(365*4+1) - 2.58*(sum(TV_day_16_19)/(365*4+1)/(356*4+1))**0.5,
sum(TV_day_16_19)/(365*4+1) + 2.58*(sum(TV_day_16_19)/(365*4+1)/(356*4+1))**0.5)
post_covid_interval <- c(sum(TV_day_20_21)/(365*2+1) - 2.58*(sum(TV_day_20_21)/(365*2+1)/(356*2+1))**0.5,
sum(TV_day_20_21)/(365*2+1) + 2.58*(sum(TV_day_20_21)/(365*2+1)/(356*2+1))**0.5)
data_CI_TV <- data.frame(x,pre_covid_interval,post_covid_interval)
ggplot(data_CI_TV) +
ggtitle("Figure 22: 99% Confidence Interval of lambda for Traffic Violations") +
ylab("99% Confidence Interval of lambda") +
xlab("") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))
The below table shows the frequency and relative frequency of Simple Assault before COVID-19.
DF_WM_16_19_SA <- DF_WM_16_19[DF_WM_16_19$Arrest_Category=='Simple Assault',]
# table of date and the number of occurrences
SA_day_16_19 <- sapply(unique(DF_WM_16_19_SA$Arrest_Date),
function(x){sum(DF_WM_16_19_SA$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 438 | 0.2997947 |
| 1 | 479 | 0.3278576 |
| 2 | 284 | 0.1943874 |
| 3 | 156 | 0.1067762 |
| 4 | 76 | 0.0520192 |
| 5 | 13 | 0.008898 |
| 6 | 8 | 0.0054757 |
| 7 | 4 | 0.0027379 |
| 8 | 1 | 6.844627^{-4} |
| 9 | 2 | 0.0013689 |
| 10 | 0 | 0 |
We got \(\lambda = 1.36\) by calculating the average of occurrences per day.
x_SA <- 0:10
y_SA <- c(438,479,284,156,76,13,8,4,1,2,0)
fx <- dpois(x=x_SA, lambda=sum(SA_day_16_19)/(365*4+1))
data_SA <- data.frame(x_SA, y_SA, fx)
ggplot(data_SA, aes(x=x_SA,y=y_SA)) +
ggtitle("Figure 23: Histogram of Simple Assault in 2016 - 2019") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_SA) +
ggtitle("Figure 24: Relative frequency histogram of Simple Assault in 2016 - 2019 \n and Poisson distribution with lambda = 1.36") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_SA,y=y_SA/(365*4+1)), stat = "identity") +
geom_line(aes(x=x_SA,y=fx), color='red') +
geom_point(aes(x=x_SA,y=fx), color='red')
The frequency and relative frequency in 2020 and 2021 is shown in below. The \(\lambda\) for 2020 and 2021 was \(0.923\).
DF_WM_20_21_SA <- DF_WM_20_21[DF_WM_20_21$Arrest_Category=='Simple Assault',]
# table of date and the number of occurrences
SA_day_20_21 <- sapply(unique(DF_WM_20_21_SA$Arrest_Date),
function(x){sum(DF_WM_20_21_SA$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 297 | 0.4062927 |
| 1 | 267 | 0.3652531 |
| 2 | 112 | 0.1532148 |
| 3 | 43 | 0.0588235 |
| 4 | 8 | 0.0109439 |
| 5 | 3 | 0.004104 |
| 6 | 0 | 0 |
| 7 | 0 | 0 |
| 8 | 1 | 0.001368 |
| 9 | 0 | 0 |
x_SA <- 0:10
y_SA <- c(297,267,112,43,8,3,0,0,1,0,0)
fx <- dpois(x=x_SA, lambda=sum(SA_day_20_21)/(365*2+1))
data_SA <- data.frame(x_SA, y_SA, fx)
ggplot(data_SA, aes(x=x_SA,y=y_SA)) +
ggtitle("Figure 25: Histogram of Simple Assault in 2020 - 2021") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_SA) +
ggtitle("Figure 25: Relative frequency histogram of Simple Assault in 2020 - 2021 \n and Poisson distribution with lambda = 0.923") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_SA,y=y_SA/(365*2+1)), stat = "identity") +
geom_line(aes(x=x_SA,y=fx), color='red') +
geom_point(aes(x=x_SA,y=fx), color='red')
Figure 26 shows the Confidence Intervals before and after COVID-19. There was no overlap in the Confidence Intervals, and there may have been a change in the Simple Assault lambda before and after COVID-19.
x <- c('Before COVID-19','After COVID-19')
pre_covid_interval <- c(sum(SA_day_16_19)/(365*4+1) - 2.58*(sum(SA_day_16_19)/(365*4+1)/(356*4+1))**0.5,
sum(SA_day_16_19)/(365*4+1) + 2.58*(sum(SA_day_16_19)/(365*4+1)/(356*4+1))**0.5)
post_covid_interval <- c(sum(SA_day_20_21)/(365*2+1) - 2.58*(sum(SA_day_20_21)/(365*2+1)/(356*2+1))**0.5,
sum(SA_day_20_21)/(365*2+1) + 2.58*(sum(SA_day_20_21)/(365*2+1)/(356*2+1))**0.5)
data_CI_SA <- data.frame(x,pre_covid_interval,post_covid_interval)
ggplot(data_CI_SA) +
ggtitle("Figure 26: 99% Confidence Interval of lambda for Simple Assault") +
ylab("99% Confidence Interval of lambda") +
xlab("") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))
The frequency and relative frequency in 2016 to 2019 is shown in below. The \(\lambda\) before COVID-19 was \(0.404\).
DF_WM_16_19_T <- DF_WM_16_19[DF_WM_16_19$Arrest_Category=='Theft',]
# table of date and the number of occurrences
T_day_16_19 <- sapply(unique(DF_WM_16_19_T$Arrest_Date),
function(x){sum(DF_WM_16_19_T$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 973 | 0.6659822 |
| 1 | 398 | 0.2724162 |
| 2 | 79 | 0.0540726 |
| 3 | 10 | 0.0068446 |
| 4 | 1 | 6.844627^{-4} |
| 5 | 0 | 0 |
x_T <- 0:5
y_T <- c(973,398,79,10,1,0)
fx <- dpois(x=x_T, lambda=sum(T_day_16_19)/(365*4+1))
data_T <- data.frame(x_T, y_T, fx)
ggplot(data_T, aes(x=x_T,y=y_T)) +
ggtitle("Figure 27: Histogram of Theft in 2016 - 2019") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_T) +
ggtitle("Figure 28: Relative frequency histogram of Theft in 2016 - 2019 \n and Poisson distribution with lambda = 0.404") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_T,y=y_T/(365*4+1)), stat = "identity") +
geom_line(aes(x=x_T,y=fx), color='red') +
geom_point(aes(x=x_T,y=fx), color='red')
The frequency and relative frequency in 2020 and 2021 are shown in below. The \(\lambda\) for 2020 and 2021 was \(0.115\).
DF_WM_20_21_T <- DF_WM_20_21[DF_WM_20_21$Arrest_Category=='Theft',]
# table of date and the number of occurrences
T_day_20_21 <- sapply(unique(DF_WM_20_21_T$Arrest_Date),
function(x){sum(DF_WM_20_21_T$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 653 | 0.8932969 |
| 1 | 72 | 0.0984952 |
| 2 | 6 | 0.0082079 |
| 3 | 0 | 0 |
x_T <- 0:3
y_T <- c(653,72,6,0)
fx <- dpois(x=x_T, lambda=sum(T_day_20_21)/(365*2+1))
data_T <- data.frame(x_T, y_T, fx)
ggplot(data_T, aes(x=x_T,y=y_T)) +
ggtitle("Figure 29: Histogram of Theft in 2020 - 2021") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_T) +
ggtitle("Figure 30: Relative frequency histogram of Theft in 2020 - 2021 \n and Poisson distribution with lambda = 0.115") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_T,y=y_T/(365*2+1)), stat = "identity") +
geom_line(aes(x=x_T,y=fx), color='red') +
geom_point(aes(x=x_T,y=fx), color='red')
Figure 31 shows the Confidence Intervals before and after COVID-19. There was no overlap in the Confidence Intervals, and there may have been a change in the Theft lambda before and after COVID-19.
x <- c('Before COVID-19','After COVID-19')
pre_covid_interval <- c(sum(T_day_16_19)/(365*4+1) - 2.58*(sum(T_day_16_19)/(365*4+1)/(356*4+1))**0.5,
sum(T_day_16_19)/(365*4+1) + 2.58*(sum(T_day_16_19)/(365*4+1)/(356*4+1))**0.5)
post_covid_interval <- c(sum(T_day_20_21)/(365*2+1) - 2.58*(sum(T_day_20_21)/(365*2+1)/(356*2+1))**0.5,
sum(T_day_20_21)/(365*2+1) + 2.58*(sum(T_day_20_21)/(365*2+1)/(356*2+1))**0.5)
data_CI_T <- data.frame(x,pre_covid_interval,post_covid_interval)
ggplot(data_CI_T) +
ggtitle("Figure 31: 99% Confidence Interval of lambda for Theft") +
ylab("99% Confidence Interval of lambda") +
xlab("") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))
Statistically significant reductions in Simple Assault and Traffic Violations were observed for \(\lambda\) before and after COVID-19. Since these crimes seem to be more likely to occur the more people are out, it is likely that the restrictions and curbs on going out due to COVID-19 contributed to the decrease in these crimes.
A statistically significant decrease in theft was also observed in \(\lambda\) before and after Corona. Considering that thefts are committed against empty homes, the decrease in empty homes due to the curfew restrictions caused by COVID-19 may have contributed to the decrease in thefts.
The more time one spends at home due, the more Offenses Against Family & Children are likely to increase. In fact, in terms of the number of cases alone, Offenses Against Family & Children have increased after COVID-19. At first glance, the curfew restrictions caused by COVID-19 seems to be the cause. However, most of these cases were caused by special incidents unrelated to COVID-19, and when these effects were removed, there was no statistically significant difference in the change in Offenses Against Family & Children before and after COVID-19. As for white males in the DC area, Offenses Against Family & Children to the point of arrest does not appear to be affected by the changes in their lives caused by COVID-19.
DF_WM_16_19_RV <- DF_WM_16_19[DF_WM_16_19$Arrest_Category=='Release Violations/Fugitive',]
# table of date and the number of occurrences
RV_day_16_19 <- sapply(unique(DF_WM_16_19_RV$Arrest_Date),
function(x){sum(DF_WM_16_19_RV$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 817 | 0.559206 |
| 1 | 428 | 0.29295 |
| 2 | 168 | 0.1149897 |
| 3 | 40 | 0.0273785 |
| 4 | 5 | 0.0034223 |
| 5 | 3 | 0.0020534 |
| 6 | 0 | 0 |
x_RV <- 0:6
y_RV <- c(817,428,168,40,5,3,0)
fx <- dpois(x=x_RV, lambda=sum(RV_day_16_19)/(365*4+1))
data_RV <- data.frame(x_RV, y_RV, fx)
ggplot(data_RV, aes(x=x_RV,y=y_RV)) +
ggtitle("Figure : Histogram of Release Violations/Fugitive in 2016 - 2019") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_RV) +
ggtitle("Figure : Relative frequency histogram of Release Violations/Fugitive in 2016 - 2019 \n and Poisson distribution with lambda = 0.629") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_RV,y=y_RV/(365*4+1)), stat = "identity") +
geom_line(aes(x=x_RV,y=fx), color='red') +
geom_point(aes(x=x_RV,y=fx), color='red')
DF_WM_20_21_RV <- DF_WM_20_21[DF_WM_20_21$Arrest_Category=='Release Violations/Fugitive',]
# table of date and the number of occurrences
RV_day_20_21 <- sapply(unique(DF_WM_20_21_RV$Arrest_Date),
function(x){sum(DF_WM_20_21_RV$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 549 | 0.751026 |
| 1 | 154 | 0.2106703 |
| 2 | 24 | 0.0328317 |
| 3 | 4 | 0.005472 |
| 4 | 0 | 0 |
x_RV <- 0:4
y_RV <- c(549,154,24,4,0)
fx <- dpois(x=x_RV, lambda=sum(RV_day_20_21)/(365*2+1))
data_RV <- data.frame(x_RV, y_RV, fx)
ggplot(data_RV, aes(x=x_RV,y=y_RV)) +
ggtitle("Figure : Histogram of Release Violations/Fugitive in 2020 - 2021") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_RV) +
ggtitle("Figure : Relative frequency histogram of Release Violations/Fugitive in 2020 - 2021 \n and Poisson distribution with lambda = 0.293") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_RV,y=y_RV/(365*2+1)), stat = "identity") +
geom_line(aes(x=x_RV,y=fx), color='red') +
geom_point(aes(x=x_RV,y=fx), color='red')
x <- c('Before COVID-19','After COVID-19')
pre_covid_interval <- c(sum(RV_day_16_19)/(365*4+1) - 2.58*(sum(RV_day_16_19)/(365*4+1)/(356*4+1))**0.5,
sum(RV_day_16_19)/(365*4+1) + 2.58*(sum(RV_day_16_19)/(365*4+1)/(356*4+1))**0.5)
post_covid_interval <- c(sum(RV_day_20_21)/(365*2+1) - 2.58*(sum(RV_day_20_21)/(365*2+1)/(356*2+1))**0.5,
sum(RV_day_20_21)/(365*2+1) + 2.58*(sum(RV_day_20_21)/(365*2+1)/(356*2+1))**0.5)
data_CI_RV <- data.frame(x,pre_covid_interval,post_covid_interval)
ggplot(data_CI_RV) +
ggtitle("Figure : 99% Confidence Interval of lambda for Release Violations/Fugitive") +
ylab("99% Confidence Interval of lambda") +
xlab("") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))
DF_WM_16_19_DI <- DF_WM_16_19[DF_WM_16_19$Arrest_Category=='Driving/Boating While Intoxicated',]
# table of date and the number of occurrences
DI_day_16_19 <- sapply(unique(DF_WM_16_19_DI$Arrest_Date),
function(x){sum(DF_WM_16_19_DI$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 875 | 0.5989049 |
| 1 | 416 | 0.2847365 |
| 2 | 138 | 0.0944559 |
| 3 | 25 | 0.0171116 |
| 4 | 6 | 0.0041068 |
| 5 | 1 | 6.844627^{-4} |
| 6 | 0 | 0 |
x_DI <- 0:6
y_DI <- c(875,416,138,25,6,1,0)
fx <- dpois(x=x_DI, lambda=sum(DI_day_16_19)/(365*4+1))
data_DI <- data.frame(x_DI, y_DI, fx)
ggplot(data_DI, aes(x=x_DI,y=y_DI)) +
ggtitle("Figure : Histogram of Driving/Boating While Intoxicated in 2016 - 2019") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_DI) +
ggtitle("Figure : Relative frequency histogram of Driving/Boating While Intoxicated in 2016 - 2019 \n and Poisson distribution with lambda = 0.545") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_DI,y=y_DI/(365*4+1)), stat = "identity") +
geom_line(aes(x=x_DI,y=fx), color='red') +
geom_point(aes(x=x_DI,y=fx), color='red')
DF_WM_20_21_DI <- DF_WM_20_21[DF_WM_20_21$Arrest_Category=='Driving/Boating While Intoxicated',]
# table of date and the number of occurrences
DI_day_20_21 <- sapply(unique(DF_WM_20_21_DI$Arrest_Date),
function(x){sum(DF_WM_20_21_DI$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 527 | 0.7209302 |
| 1 | 168 | 0.2298222 |
| 2 | 29 | 0.0396717 |
| 3 | 5 | 0.0068399 |
| 4 | 2 | 0.002736 |
| 5 | 0 | 0 |
x_DI <- 0:5
y_DI <- c(527,168,29,5,2,0)
fx <- dpois(x=x_DI, lambda=sum(DI_day_20_21)/(365*2+1))
data_DI <- data.frame(x_DI, y_DI, fx)
ggplot(data_DI, aes(x=x_DI,y=y_DI)) +
ggtitle("Figure : Histogram of Driving/Boating While Intoxicated in 2020 - 2021") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_DI) +
ggtitle("Figure : Relative frequency histogram of Driving/Boating While Intoxicated in 2020 - 2021 \n and Poisson distribution with lambda = 0.341") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_DI,y=y_DI/(365*2+1)), stat = "identity") +
geom_line(aes(x=x_DI,y=fx), color='red') +
geom_point(aes(x=x_DI,y=fx), color='red')
x <- c('Before COVID-19','After COVID-19')
pre_covid_interval <- c(sum(DI_day_16_19)/(365*4+1) - 2.58*(sum(DI_day_16_19)/(365*4+1)/(356*4+1))**0.5,
sum(DI_day_16_19)/(365*4+1) + 2.58*(sum(DI_day_16_19)/(365*4+1)/(356*4+1))**0.5)
post_covid_interval <- c(sum(DI_day_20_21)/(365*2+1) - 2.58*(sum(DI_day_20_21)/(365*2+1)/(356*2+1))**0.5,
sum(DI_day_20_21)/(365*2+1) + 2.58*(sum(DI_day_20_21)/(365*2+1)/(356*2+1))**0.5)
data_CI_DI <- data.frame(x,pre_covid_interval,post_covid_interval)
ggplot(data_CI_DI) +
ggtitle("Figure : 99% Confidence Interval of lambda for Driving/Boating While Intoxicated") +
ylab("99% Confidence Interval of lambda") +
xlab("") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))
DF_WM_16_19_N <- DF_WM_16_19[DF_WM_16_19$Arrest_Category=='Narcotics',]
# table of date and the number of occurrences
N_day_16_19 <- sapply(unique(DF_WM_16_19_N$Arrest_Date),
function(x){sum(DF_WM_16_19_N$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 1053 | 0.7207392 |
| 1 | 318 | 0.2176591 |
| 2 | 66 | 0.0451745 |
| 3 | 10 | 0.0068446 |
| 4 | 5 | 0.0034223 |
| 5 | 6 | 0.0041068 |
| 6 | 0 | 0 |
| 7 | 1 | 6.844627^{-4} |
| 8 | 0 | 0 |
| 9 | 1 | 6.844627^{-4} |
| 10 | 0 | 0 |
| 11 | 0 | 0 |
| 12 | 0 | 0 |
| 13 | 1 | 6.844627^{-4} |
| 14 | 0 | 0 |
x_N <- 0:14
y_N <- c(1053,318,66,10,5,6,0,1,0,1,0,0,0,1,0)
fx <- dpois(x=x_N, lambda=sum(N_day_16_19)/(365*4+1))
data_N <- data.frame(x_N, y_N, fx)
ggplot(data_N, aes(x=x_N,y=y_N)) +
ggtitle("Figure : Histogram of Narcotics in 2016 - 2019") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_N) +
ggtitle("Figure : Relative frequency histogram of Narcotics in 2016 - 2019 \n and Poisson distribution with lambda = 0.383") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_N,y=y_N/(365*4+1)), stat = "identity") +
geom_line(aes(x=x_N,y=fx), color='red') +
geom_point(aes(x=x_N,y=fx), color='red')
DF_WM_20_21_N <- DF_WM_20_21[DF_WM_20_21$Arrest_Category=='Narcotics',]
# table of date and the number of occurrences
N_day_20_21 <- sapply(unique(DF_WM_20_21_N$Arrest_Date),
function(x){sum(DF_WM_20_21_N$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 652 | 0.8919289 |
| 1 | 66 | 0.0902873 |
| 2 | 10 | 0.0136799 |
| 3 | 3 | 0.004104 |
| 4 | 0 | 0 |
x_N <- 0:4
y_N <- c(652,66,10,3,0)
fx <- dpois(x=x_N, lambda=sum(N_day_20_21)/(365*2+1))
data_N <- data.frame(x_N, y_N, fx)
ggplot(data_N, aes(x=x_N,y=y_N)) +
ggtitle("Figure : Histogram of Narcotics in 2020 - 2021") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_N) +
ggtitle("Figure : Relative frequency histogram of Narcotics in 2020 - 2021 \n and Poisson distribution with lambda = 0.13") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_N,y=y_N/(365*2+1)), stat = "identity") +
geom_line(aes(x=x_N,y=fx), color='red') +
geom_point(aes(x=x_N,y=fx), color='red')
x <- c('Before COVID-19','After COVID-19')
pre_covid_interval <- c(sum(N_day_16_19)/(365*4+1) - 2.58*(sum(N_day_16_19)/(365*4+1)/(356*4+1))**0.5,
sum(N_day_16_19)/(365*4+1) + 2.58*(sum(N_day_16_19)/(365*4+1)/(356*4+1))**0.5)
post_covid_interval <- c(sum(N_day_20_21)/(365*2+1) - 2.58*(sum(N_day_20_21)/(365*2+1)/(356*2+1))**0.5,
sum(N_day_20_21)/(365*2+1) + 2.58*(sum(N_day_20_21)/(365*2+1)/(356*2+1))**0.5)
data_CI_N <- data.frame(x,pre_covid_interval,post_covid_interval)
ggplot(data_CI_N) +
ggtitle("Figure : 99% Confidence Interval of lambda for Narcotics") +
ylab("99% Confidence Interval of lambda") +
xlab("") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))
DF_WM_16_19_LV <- DF_WM_16_19[DF_WM_16_19$Arrest_Category=='Liquor Law Violations',]
# table of date and the number of occurrences
LV_day_16_19 <- sapply(unique(DF_WM_16_19_LV$Arrest_Date),
function(x){sum(DF_WM_16_19_LV$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 1090 | 0.7460643 |
| 1 | 259 | 0.1772758 |
| 2 | 76 | 0.0520192 |
| 3 | 32 | 0.0219028 |
| 4 | 3 | 0.0020534 |
| 5 | 1 | 6.844627^{-4} |
| 6 | 0 | 0 |
x_LV <- 0:6
y_LV <- c(1090,259,76,32,3,1,0)
fx <- dpois(x=x_LV, lambda=sum(LV_day_16_19)/(365*4+1))
data_LV <- data.frame(x_LV, y_LV, fx)
ggplot(data_LV, aes(x=x_LV,y=y_LV)) +
ggtitle("Figure : Histogram of Liquor Law Violations in 2016 - 2019") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_LV) +
ggtitle("Figure : Relative frequency histogram of Liquor Law Violations in 2016 - 2019 \n and Poisson distribution with lambda = 0.359") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_LV,y=y_LV/(365*4+1)), stat = "identity") +
geom_line(aes(x=x_LV,y=fx), color='red') +
geom_point(aes(x=x_LV,y=fx), color='red')
DF_WM_20_21_LV <- DF_WM_20_21[DF_WM_20_21$Arrest_Category=='Liquor Law Violations',]
# table of date and the number of occurrences
LV_day_20_21 <- sapply(unique(DF_WM_20_21_LV$Arrest_Date),
function(x){sum(DF_WM_20_21_LV$Arrest_Date==x)})
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 699 | 0.9562244 |
| 1 | 27 | 0.0369357 |
| 2 | 4 | 0.005472 |
| 3 | 1 | 0.001368 |
| 4 | 0 | 0 |
x_LV <- 0:4
y_LV <- c(699,27,4,1,0)
fx <- dpois(x=x_LV, lambda=sum(LV_day_20_21)/(365*2+1))
data_LV <- data.frame(x_LV, y_LV, fx)
ggplot(data_LV, aes(x=x_LV,y=y_LV)) +
ggtitle("Figure : Histogram of Liquor Law Violations in 2020 - 2021") +
xlab("Number of occurrences per day") +
ylab("Frequency") +
geom_bar(stat = "identity")
ggplot(data_LV) +
ggtitle("Figure : Relative frequency histogram of Liquor Law Violations in 2020 - 2021 \n and Poisson distribution with lambda = 0.052") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_LV,y=y_LV/(365*2+1)), stat = "identity") +
geom_line(aes(x=x_LV,y=fx), color='red') +
geom_point(aes(x=x_LV,y=fx), color='red')
x <- c('Before COVID-19','After COVID-19')
pre_covid_interval <- c(sum(LV_day_16_19)/(365*4+1) - 2.58*(sum(LV_day_16_19)/(365*4+1)/(356*4+1))**0.5,
sum(LV_day_16_19)/(365*4+1) + 2.58*(sum(LV_day_16_19)/(365*4+1)/(356*4+1))**0.5)
post_covid_interval <- c(sum(LV_day_20_21)/(365*2+1) - 2.58*(sum(LV_day_20_21)/(365*2+1)/(356*2+1))**0.5,
sum(LV_day_20_21)/(365*2+1) + 2.58*(sum(LV_day_20_21)/(365*2+1)/(356*2+1))**0.5)
data_CI_LV <- data.frame(x,pre_covid_interval,post_covid_interval)
ggplot(data_CI_LV) +
ggtitle("Figure : 99% Confidence Interval of lambda for Liquor Law Violations") +
ylab("99% Confidence Interval of lambda") +
xlab("") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))